;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.21 2005/02/02 17:47:38 edi Exp $

;;; This file defines the REGEX class and some utility methods for
;;; this class. REGEX objects are used to represent the (transformed)
;;; parse trees internally

;;; Copyright (c) 2002-2004, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package #:cl-ppcre)

;; Genera need the eval-when, here, or the types created by the class
;; definitions aren't seen by the typep calls later in the file.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (locally
    (declare (optimize speed
                       (safety 0)
                       (space 0)
                       (debug 0)
                       (compilation-speed 0)
                       #+:lispworks (hcl:fixnum-safety 0)))
    (defclass regex ()
         ()
      (:documentation "The REGEX base class. All other classes inherit
from this one."))


    (defclass seq (regex)
         ((elements :initarg :elements
                    :accessor elements
                    :type cons
                    :documentation "A list of REGEX objects."))
      (:documentation "SEQ objects represents sequences of
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))

    (defclass alternation (regex)
         ((choices :initarg :choices
                   :accessor choices
                   :type cons
                   :documentation "A list of REGEX objects"))
      (:documentation "ALTERNATION objects represent alternations of
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))

    (defclass lookahead (regex)
         ((regex :initarg :regex
                 :accessor regex
                 :documentation "The REGEX object we're checking.")
          (positivep :initarg :positivep
                     :reader positivep
                     :documentation "Whether this assertion is positive."))
      (:documentation "LOOKAHEAD objects represent look-ahead assertions."))

    (defclass lookbehind (regex)
         ((regex :initarg :regex
                 :accessor regex
                 :documentation "The REGEX object we're checking.")
          (positivep :initarg :positivep
                     :reader positivep
                     :documentation "Whether this assertion is positive.")
          (len :initarg :len
               :accessor len
               :type fixnum
               :documentation "The (fixed) length of the enclosed regex."))
      (:documentation "LOOKBEHIND objects represent look-behind assertions."))

    (defclass repetition (regex)
         ((regex :initarg :regex
                 :accessor regex
                 :documentation "The REGEX that's repeated.")
          (greedyp :initarg :greedyp
                   :reader greedyp
                   :documentation "Whether the repetition is greedy.")
          (minimum :initarg :minimum
                   :accessor minimum
                   :type fixnum
                   :documentation "The minimal number of repetitions.")
          (maximum :initarg :maximum
                   :accessor maximum
                   :documentation "The maximal number of repetitions.
Can be NIL for unbounded.")
          (min-len :initarg :min-len
                   :reader min-len
                   :documentation "The minimal length of the enclosed regex.")
          (len :initarg :len
               :reader len
               :documentation "The length of the enclosed regex. NIL
if unknown.")
          (min-rest :initform 0
                    :accessor min-rest
                    :type fixnum
                    :documentation "The minimal number of characters which must
appear after this repetition.")
          (contains-register-p :initarg :contains-register-p
                               :reader contains-register-p
                               :documentation "If the regex contains a register."))
      (:documentation "REPETITION objects represent repetitions of regexes."))

    (defclass register (regex)
         ((regex :initarg :regex
                 :accessor regex
                 :documentation "The inner regex.")
          (num :initarg :num
               :reader num
               :type fixnum
               :documentation "The number of this register, starting from 0.
This is the index into *REGS-START* and *REGS-END*."))
      (:documentation "REGISTER objects represent register groups."))

    (defclass standalone (regex)
         ((regex :initarg :regex
                 :accessor regex
                 :documentation "The inner regex."))
      (:documentation "A standalone regular expression."))
  
    (defclass back-reference (regex)
         ((num :initarg :num
               :accessor num
               :type fixnum
               :documentation "The number of the register this
reference refers to.")
          (case-insensitive-p :initarg :case-insensitive-p
                              :reader case-insensitive-p
                              :documentation "Whether we check
case-insensitively."))
      (:documentation "BACK-REFERENCE objects represent backreferences."))

    (defclass char-class (regex)
         ((hash :initarg :hash
                :reader hash
                :type (or hash-table null)
                :documentation "A hash table the keys of which are the
characters; the values are always T.")
          (case-insensitive-p :initarg :case-insensitive-p
                              :reader case-insensitive-p
                              :documentation "If the char class
case-insensitive.")
          (invertedp :initarg :invertedp
                     :reader invertedp
                     :documentation "Whether we mean the inverse of
the char class.")
          (word-char-class-p :initarg :word-char-class-p
                             :reader word-char-class-p
                             :documentation "Whether this CHAR CLASS
represents the special class WORD-CHAR-CLASS."))
      (:documentation "CHAR-CLASS objects represent character classes."))

    (defclass str (regex)
         ((str :initarg :str
               :accessor str
               :type string
               :documentation "The actual string.")
          (len :initform 0
               :accessor len
               :type fixnum
               :documentation "The length of the string.")
          (case-insensitive-p :initarg :case-insensitive-p
                              :reader case-insensitive-p
                              :documentation "If we match case-insensitively.")
          (offset :initform nil
                  :accessor offset
                  :documentation "Offset from the left of the whole
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind
a variable-length regex.")
          (skip :initform nil
                :initarg :skip
                :accessor skip
                :documentation "If we can avoid testing for this
string because the SCAN function has done this already.")
          (start-of-end-string-p :initform nil
                                 :accessor start-of-end-string-p
                                 :documentation "If this is the unique
STR which starts END-STRING (a slot of MATCHER)."))
      (:documentation "STR objects represent string."))

    (defclass anchor (regex)
         ((startp :initarg :startp
                  :reader startp
                  :documentation "Whether this is a \"start anchor\".")
          (multi-line-p :initarg :multi-line-p
                        :reader multi-line-p
                        :documentation "Whether we're in multi-line mode,
i.e. whether each #\\Newline is surrounded by anchors.")
          (no-newline-p :initarg :no-newline-p
                        :reader no-newline-p
                        :documentation "Whether we ignore #\\Newline at the end."))
      (:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))

    (defclass everything (regex)
         ((single-line-p :initarg :single-line-p
                         :reader single-line-p
                         :documentation "Whether we're in single-line mode,
i.e. whether we also match #\\Newline."))
      (:documentation "EVERYTHING objects represent regexes matching
\"everything\", i.e. dots."))

    (defclass word-boundary (regex)
         ((negatedp :initarg :negatedp
                    :reader negatedp
                    :documentation "Whether we mean the opposite,
i.e. no word-boundary."))
      (:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))

    (defclass branch (regex)
         ((test :initarg :test
                :accessor test
                :documentation "The test of this branch, one of
LOOKAHEAD, LOOKBEHIND, or a number.")
          (then-regex :initarg :then-regex
                      :accessor then-regex
                      :documentation "The regex that's to be matched if the
test succeeds.")
          (else-regex :initarg :else-regex
                      :initform (make-instance 'void)
                      :accessor else-regex
                      :documentation "The regex that's to be matched if the
test fails."))
      (:documentation "BRANCH objects represent Perl's conditional regular
expressions."))
    
    (defclass filter (regex)
         ((fn :initarg :fn
              :accessor fn
              :type (or function symbol)
              :documentation "The user-defined function.")
          (len :initarg :len
               :reader len
               :documentation "The fixed length of this filter or NIL."))
      (:documentation "FILTER objects represent arbitrary functions
defined by the user."))

    (defclass void (regex)
         ()
      (:documentation "VOID objects represent empty regular expressions."))))

(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  "Make large hash tables smaller, if possible."
  (let ((hash (getf init-args :hash)))
    (when (and hash
               (> *regex-char-code-limit* 256)
               (> (hash-table-count hash)
                  (/ *regex-char-code-limit* 2)))
      (setf (slot-value char-class 'hash)
              (merge-inverted-hash (make-hash-table)
                                   hash)
            (slot-value char-class 'invertedp)
              (not (slot-value char-class 'invertedp))))))

(declaim (ftype (function (t) simple-string) str))

;;; The following four methods allow a VOID object to behave like a
;;; zero-length STR object (only readers needed)

(defmethod initialize-instance :after ((str str) &rest init-args)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (declare (ignore init-args))
  "Automatically computes the length of a STR after initialization."
  (let ((str-slot (slot-value str 'str)))
    (unless (typep str-slot 'simple-string)
      (setf (slot-value str 'str) (coerce str-slot 'simple-string))))
  (setf (len str) (length (str str))))

(defmethod len ((void void))
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  0)

(defmethod str ((void void))
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  "")

(defmethod skip ((void void))
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  nil)

(defmethod start-of-end-string-p ((void void))
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  nil)

(defgeneric case-mode (regex old-case-mode)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
Returns a keyword denoting the case-(in)sensitivity of a STR or its
second argument if the STR has length 0. Returns NIL for REGEX objects
which are not of type STR."))

(defmethod case-mode ((str str) old-case-mode)
  (cond ((zerop (len str))
          old-case-mode)
        ((case-insensitive-p str)
          :case-insensitive)
        (t
          :case-sensitive)))

(defmethod case-mode ((regex regex) old-case-mode)
  (declare (ignore old-case-mode))
  nil)

(defgeneric copy-regex (regex)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Implements a deep copy of a REGEX object."))

(defmethod copy-regex ((anchor anchor))
  (make-instance 'anchor
                 :startp (startp anchor)
                 :multi-line-p (multi-line-p anchor)
                 :no-newline-p (no-newline-p anchor)))

(defmethod copy-regex ((everything everything))
  (make-instance 'everything
                 :single-line-p (single-line-p everything)))

(defmethod copy-regex ((word-boundary word-boundary))
  (make-instance 'word-boundary
                 :negatedp (negatedp word-boundary)))

(defmethod copy-regex ((void void))
  (make-instance 'void))

(defmethod copy-regex ((lookahead lookahead))
  (make-instance 'lookahead
                 :regex (copy-regex (regex lookahead))
                 :positivep (positivep lookahead)))

(defmethod copy-regex ((seq seq))
  (make-instance 'seq
                 :elements (mapcar #'copy-regex (elements seq))))

(defmethod copy-regex ((alternation alternation))
  (make-instance 'alternation
                 :choices (mapcar #'copy-regex (choices alternation))))

(defmethod copy-regex ((branch branch))
  (with-slots ((test test))
      branch
    (make-instance 'branch
                   :test (if (typep test 'regex)
                           (copy-regex test)
                           test)
                   :then-regex (copy-regex (then-regex branch))
                   :else-regex (copy-regex (else-regex branch)))))

(defmethod copy-regex ((lookbehind lookbehind))
  (make-instance 'lookbehind
                 :regex (copy-regex (regex lookbehind))
                 :positivep (positivep lookbehind)
                 :len (len lookbehind)))

(defmethod copy-regex ((repetition repetition))
  (make-instance 'repetition
                 :regex (copy-regex (regex repetition))
                 :greedyp (greedyp repetition)
                 :minimum (minimum repetition)
                 :maximum (maximum repetition)
                 :min-len (min-len repetition)
                 :len (len repetition)
                 :contains-register-p (contains-register-p repetition)))

(defmethod copy-regex ((register register))
  (make-instance 'register
                 :regex (copy-regex (regex register))
                 :num (num register)))

(defmethod copy-regex ((standalone standalone))
  (make-instance 'standalone
                 :regex (copy-regex (regex standalone))))

(defmethod copy-regex ((back-reference back-reference))
  (make-instance 'back-reference
                 :num (num back-reference)
                 :case-insensitive-p (case-insensitive-p back-reference)))

(defmethod copy-regex ((char-class char-class))
  (make-instance 'char-class
                 :hash (hash char-class)
                 :case-insensitive-p (case-insensitive-p char-class)
                 :invertedp (invertedp char-class)
                 :word-char-class-p (word-char-class-p char-class)))

(defmethod copy-regex ((str str))
  (make-instance 'str
                 :str (str str)
                 :case-insensitive-p (case-insensitive-p str)))

(defmethod copy-regex ((filter filter))
  (make-instance 'filter
                 :fn (fn filter)
                 :len (len filter)))

;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
;;; wrapped into one function. Maybe in the next release...

;;; Further note that this function is used by CONVERT to factor out
;;; complicated repetitions, i.e. cases like
;;;   (a)* -> (?:a*(a))?
;;; This won't work for, say,
;;;   ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.

(defgeneric remove-registers (regex)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
optionally removes embedded REGISTER objects if possible and if the
special variable REMOVE-REGISTERS-P is true."))

(defmethod remove-registers ((register register))
  (declare (special remove-registers-p reg-seen))
  (cond (remove-registers-p
          (remove-registers (regex register)))
        (t
          ;; mark REG-SEEN as true so enclosing REPETITION objects
          ;; (see method below) know if they contain a register or not
          (setq reg-seen t)
          (copy-regex register))))

(defmethod remove-registers ((repetition repetition))
  (let* (reg-seen
         (inner-regex (remove-registers (regex repetition))))
    ;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
    ;; (REGEX REPETITION) contains a REGISTER
    (declare (special reg-seen))
    (make-instance 'repetition
                   :regex inner-regex
                   :greedyp (greedyp repetition)
                   :minimum (minimum repetition)
                   :maximum (maximum repetition)
                   :min-len (min-len repetition)
                   :len (len repetition)
                   :contains-register-p reg-seen)))

(defmethod remove-registers ((standalone standalone))
  (make-instance 'standalone
                 :regex (remove-registers (regex standalone))))

(defmethod remove-registers ((lookahead lookahead))
  (make-instance 'lookahead
                 :regex (remove-registers (regex lookahead))
                 :positivep (positivep lookahead)))

(defmethod remove-registers ((lookbehind lookbehind))
  (make-instance 'lookbehind
                 :regex (remove-registers (regex lookbehind))
                 :positivep (positivep lookbehind)
                 :len (len lookbehind)))

(defmethod remove-registers ((branch branch))
  (with-slots ((test test))
      branch
    (make-instance 'branch
                   :test (if (typep test 'regex)
                           (remove-registers test)
                           test)
                   :then-regex (remove-registers (then-regex branch))
                   :else-regex (remove-registers (else-regex branch)))))

(defmethod remove-registers ((alternation alternation))
  (declare (special remove-registers-p))
  ;; an ALTERNATION, so we can't remove REGISTER objects further down
  (setq remove-registers-p nil)
  (copy-regex alternation))

(defmethod remove-registers ((regex regex))
  (copy-regex regex))

(defmethod remove-registers ((seq seq))
  (make-instance 'seq
                 :elements (mapcar #'remove-registers (elements seq))))

(defgeneric everythingp (regex)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Returns an EVERYTHING object if REGEX is equivalent
to this object, otherwise NIL. So, \"(.){1}\" would return true
(i.e. the object corresponding to \".\", for example."))

(defmethod everythingp ((seq seq))
  ;; we might have degenerate cases like (:SEQUENCE :VOID ...)
  ;; due to the parsing process
  (let ((cleaned-elements (remove-if #'(lambda (element)
                                         (typep element 'void))
                                     (elements seq))))
    (and (= 1 (length cleaned-elements))
         (everythingp (first cleaned-elements)))))

(defmethod everythingp ((alternation alternation))
  (with-slots ((choices choices))
      alternation
    (and (= 1 (length choices))
         ;; this is unlikely to happen for human-generated regexes,
         ;; but machine-generated ones might look like this
         (everythingp (first choices)))))

(defmethod everythingp ((repetition repetition))
  (with-slots ((maximum maximum)
               (minimum minimum)
               (regex regex))
      repetition
    (and maximum
         (= 1 minimum maximum)
         ;; treat "<regex>{1,1}" like "<regex>"
         (everythingp regex))))

(defmethod everythingp ((register register))
  (everythingp (regex register)))

(defmethod everythingp ((standalone standalone))
  (everythingp (regex standalone)))

(defmethod everythingp ((everything everything))
  everything)

(defmethod everythingp ((regex regex))
  ;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
  ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
  nil)

(defgeneric regex-length (regex)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))

(defmethod regex-length ((seq seq))
  ;; simply add all inner lengths unless one of them is NIL
  (loop for sub-regex in (elements seq)
        for len = (regex-length sub-regex)
        if (not len) do (return nil)
        sum len))

(defmethod regex-length ((alternation alternation))
  ;; only return a true value if all inner lengths are non-NIL and
  ;; mutually equal
  (loop for sub-regex in (choices alternation)
        for old-len = nil then len
        for len = (regex-length sub-regex)
        if (or (not len)
               (and old-len (/= len old-len))) do (return nil)
        finally (return len)))

(defmethod regex-length ((branch branch))
  ;; only return a true value if both alternations have a length and
  ;; if they're equal
  (let ((then-length (regex-length (then-regex branch))))
    (and then-length
         (eql then-length (regex-length (else-regex branch)))
         then-length)))

(defmethod regex-length ((repetition repetition))
  ;; we can only compute the length of a REPETITION object if the
  ;; number of repetitions is fixed; note that we don't call
  ;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
  ;; always set correctly
  (with-slots ((len len)
               (minimum minimum)
               (maximum maximum))
      repetition
    (if (and len
             (eq minimum maximum))
      (* minimum len)
      nil)))

(defmethod regex-length ((register register))
  (regex-length (regex register)))

(defmethod regex-length ((standalone standalone))
  (regex-length (regex standalone)))

(defmethod regex-length ((back-reference back-reference))
  ;; with enough effort we could possibly do better here, but
  ;; currently we just give up and return NIL
  nil)
    
(defmethod regex-length ((char-class char-class))
  1)

(defmethod regex-length ((everything everything))
  1)

(defmethod regex-length ((str str))
  (len str))

(defmethod regex-length ((filter filter))
  (len filter))

(defmethod regex-length ((regex regex))
  ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
  ;; WORD-BOUNDARY (which all have zero-length)
  0)

(defgeneric regex-min-length (regex)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Returns the minimal length of REGEX."))

(defmethod regex-min-length ((seq seq))
  ;; simply add all inner minimal lengths
  (loop for sub-regex in (elements seq)
        for len = (regex-min-length sub-regex)
        sum len))

(defmethod regex-min-length ((alternation alternation))
  ;; minimal length of an alternation is the minimal length of the
  ;; "shortest" element
  (loop for sub-regex in (choices alternation)
        for len = (regex-min-length sub-regex)
        minimize len))

(defmethod regex-min-length ((branch branch))
  ;; minimal length of both alternations
  (min (regex-min-length (then-regex branch))
       (regex-min-length (else-regex branch))))

(defmethod regex-min-length ((repetition repetition))
  ;; obviously the product of the inner minimal length and the minimal
  ;; number of repetitions
  (* (minimum repetition) (min-len repetition)))
    
(defmethod regex-min-length ((register register))
  (regex-min-length (regex register)))
    
(defmethod regex-min-length ((standalone standalone))
  (regex-min-length (regex standalone)))
    
(defmethod regex-min-length ((char-class char-class))
  1)

(defmethod regex-min-length ((everything everything))
  1)

(defmethod regex-min-length ((str str))
  (len str))
    
(defmethod regex-min-length ((filter filter))
  (or (len filter)
      0))

(defmethod regex-min-length ((regex regex))
  ;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
  ;; LOOKBEHIND, VOID, and WORD-BOUNDARY
  0)

(defgeneric compute-offsets (regex start-pos)
  (declare (optimize speed
                     (safety 0)
                     (space 0)
                     (debug 0)
                     (compilation-speed 0)
                     #+:lispworks (hcl:fixnum-safety 0)))
  (:documentation "Returns the offset the following regex would have
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
slots of STR objects further down the tree."))

;; note that we're actually only interested in the offset of
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
;; can stop at variable-length alternations and don't need to descend
;; into repetitions

(defmethod compute-offsets ((seq seq) start-pos)
  (loop for element in (elements seq)
        ;; advance offset argument for next call while looping through
        ;; the elements
        for pos = start-pos then curr-offset
        for curr-offset = (compute-offsets element pos)
        while curr-offset
        finally (return curr-offset)))

(defmethod compute-offsets ((alternation alternation) start-pos)
  (loop for choice in (choices alternation)
        for old-offset = nil then curr-offset
        for curr-offset = (compute-offsets choice start-pos)
        ;; we stop immediately if two alternations don't result in the
        ;; same offset
        if (or (not curr-offset)
               (and old-offset (/= curr-offset old-offset)))
          do (return nil)
        finally (return curr-offset)))

(defmethod compute-offsets ((branch branch) start-pos)
  ;; only return offset if both alternations have equal value
  (let ((then-offset (compute-offsets (then-regex branch) start-pos)))
    (and then-offset
         (eql then-offset (compute-offsets (else-regex branch) start-pos))
         then-offset)))

(defmethod compute-offsets ((repetition repetition) start-pos)
  ;; no need to descend into the inner regex
  (with-slots ((len len)
               (minimum minimum)
               (maximum maximum))
      repetition
    (if (and len
             (eq minimum maximum))
      ;; fixed number of repetitions, so we know how to proceed
      (+ start-pos (* minimum len))
      ;; otherwise return NIL
      nil)))

(defmethod compute-offsets ((register register) start-pos)
  (compute-offsets (regex register) start-pos))
    
(defmethod compute-offsets ((standalone standalone) start-pos)
  (compute-offsets (regex standalone) start-pos))
    
(defmethod compute-offsets ((char-class char-class) start-pos)
  (1+ start-pos))
    
(defmethod compute-offsets ((everything everything) start-pos)
  (1+ start-pos))
    
(defmethod compute-offsets ((str str) start-pos)
  (setf (offset str) start-pos)
  (+ start-pos (len str)))

(defmethod compute-offsets ((back-reference back-reference) start-pos)
  ;; with enough effort we could possibly do better here, but
  ;; currently we just give up and return NIL
  (declare (ignore start-pos))
  nil)

(defmethod compute-offsets ((filter filter) start-pos)
  (let ((len (len filter)))
    (if len
      (+ start-pos len)
      nil)))

(defmethod compute-offsets ((regex regex) start-pos)
  ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
  ;; WORD-BOUNDARY (which all have zero-length)
  start-pos)
